library(tidyverse)
# load data 
# set working directory here:
# Read the raw survey export.
# - strip.white = TRUE trims whitespace around unquoted fields
# - na.strings = "" turns empty cells into NA
# - stringsAsFactors = FALSE keeps text columns as character
# (per the original notes, every variable arrives as a character/string
# column at this point)
raw_conjoint <- read.csv(
  "ConjointData_dec2019.csv",
  strip.white = TRUE,
  stringsAsFactors = FALSE,
  na.strings = ""
)

# Drop the first two data rows, which contain meta-data rather than responses.
raw_conjoint <- raw_conjoint %>% slice(-1:-2)

# Drop the field-name variables (e.g. "F.1.1") that just say "Composition"
# (a quirk of the survey programming); the cleaned copy is `conjoint`.
conjoint <- select(raw_conjoint, -c(F.1.1, F.2.1, F.3.1, F.4.1))

# Columns that can be converted to numeric as-is.
## (the ratings outcomes carry text labels and need trimming first, see below)
numeric_vars <- c("Duration..in.seconds.", "age")

# Columns to be treated as categorical (factor) variables.
factor_vars <- c("gender", "educ", "race", "faminc",
                 "choice1", "choice2", "choice3", "choice4",
                 "immigration", "aa_1", "aa_2", "aa_3",
                 "born")

# Convert the class of those columns in place.
# The name vectors are handed to `.vars` directly instead of being wrapped in
# vars(): inside vars() a bare `numeric_vars` is resolved by tidy-select, which
# is ambiguous (a data column of that name would win) and emits a note.
conjoint <- conjoint %>%
  mutate_at(numeric_vars, as.numeric) %>%
  mutate_at(factor_vars, as.factor)

### more complex recoding

# for the "rating" variables, remove the text descriptions attached to "1" and "7" and recode to numeric
# first, write a string extraction function
# Parse the numeric score at the start of a labelled rating string.
#
# x: character vector (anything else is coerced with as.character()), e.g.
#    "1 (some label)" or "4".
# Returns a numeric vector the same length as x holding the leading run of
# digits of each element; NA where the element is NA or does not start with a
# digit (optionally preceded by whitespace).
#
# The leading digits are matched explicitly rather than taking a fixed
# two-character window, so the result does not depend on what character
# happens to follow the number (a window of "1-" would parse to NA).
grab_first_char <- function(x) {
  x <- as.character(x)
  hit <- regexpr("^[[:space:]]*[0-9]+", x)
  # regexpr() gives -1 for "no match" and NA for NA input
  found <- !is.na(hit) & hit > 0
  leading <- rep(NA_character_, length(x))
  # regmatches() returns only the matched elements, in order
  leading[found] <- regmatches(x, hit)
  as.numeric(leading)
}
# Run the rating parser over every column whose name begins with "rating".
conjoint <- conjoint %>%
  mutate_at(vars(starts_with("rating")), grab_first_char)

# replace_but_keep flags straightliners (as flagged by Qualtrics);
# a missing flag is recorded as 0.
conjoint$replace_but_keep <- replace(
  conjoint$replace_but_keep,
  is.na(conjoint$replace_but_keep),
  0
)

# Put the listed levels first, in the given order, for selected factors.
# Any level not listed keeps its place after the listed ones.
conjoint <- mutate(
  conjoint,
  race = fct_relevel(
    race,
    c("White", "Black/African American",
      "Hispanic/Latino(a)", "Asian/Asian American")
  ),
  educ = fct_relevel(
    educ,
    c("Less than high school", "High school/GED",
      "2-year degree (Associate's)", "Some college",
      "4-year degree (Bachelor's)", "Graduate or professional degree")
  ),
  faminc = fct_relevel(
    faminc,
    c("Less than $10,000", "$10,000 - $19,999",
      "$20,000 - $29,999", "$30,000 - $39,999",
      "$40,000 - $49,999", "$50,000 - $59,999",
      "$60,000 - $69,999", "$70,000 - $79,999",
      "$80,000 - $89,999", "$90,000 - $99,999",
      "$100,000 - $149,999", "More than $150,000")
  ),
  born = fct_relevel(
    born,
    c("United States",
      "U.S. territory (e.g., Puerto Rico)",
      "Some other country")
  )
)

############### RECODING ATTRIBUTES ################

# Reshape from wide to long in three passes and save the result as `data`.
data <- conjoint %>%
  # pass 1: stack only the vignette-text columns (the "F.1.1.1" type variables)
  gather(Neighborhood, Attribute,
         -c(StartDate:vote16, replace_but_keep)) %>%
  # pass 2: stack only the "rating1_A" type variables
  gather(Which_rating, rating,
         -c(StartDate:choice1, choice2, choice3, choice4,
            betterplace:Attribute)) %>%
  # pass 3: stack only the "choice1" type variables
  gather(Which_choice, choice,
         -c(StartDate:faminc, betterplace:rating)) %>%
  # The three passes cross every vignette with every rating and every choice,
  # so most rows pair a description with the wrong outcome (e.g. the
  # Neighborhood A shown in round 1 next to the rating of Neighborhood B from
  # round 2, 3 or 4). Keep only the rows whose description, rating and choice
  # all refer to the same round and the same neighborhood.
  filter(
    paste(Neighborhood, Which_rating, Which_choice) %in% c(
      "F.1.1.1 rating1_A choice1",
      "F.1.2.1 rating1_B choice1",
      "F.2.1.1 rating2_A choice2",
      "F.2.2.1 rating2_B choice2",
      "F.3.1.1 rating3_A choice3",
      "F.3.2.1 rating3_B choice3",
      "F.4.1.1 rating4_A choice4",
      "F.4.2.1 rating4_B choice4"
    )
  ) %>%
  # Sort by ResponseID -- each response ID should now have 8 rows: one rating
  # per neighborhood seen, plus an indicator of whether that neighborhood was
  # the one selected as more diverse in its head-to-head matchup.
  arrange(ResponseID) %>%
  # Recode the choice to 0/1: 1 when the row's own neighborhood was picked.
  # Columns F.<round>.1.1 are paired with the *_A ratings and
  # F.<round>.2.1 with the *_B ratings.
  # Any other choice value (including NA) becomes NA.
  mutate(choice = as.numeric(case_when(
    Neighborhood %in% c("F.1.1.1", "F.2.1.1", "F.3.1.1", "F.4.1.1") &
      choice == "Neighborhood A" ~ 1,
    Neighborhood %in% c("F.1.1.1", "F.2.1.1", "F.3.1.1", "F.4.1.1") &
      choice == "Neighborhood B" ~ 0,
    Neighborhood %in% c("F.1.2.1", "F.2.2.1", "F.3.2.1", "F.4.2.1") &
      choice == "Neighborhood A" ~ 0,
    Neighborhood %in% c("F.1.2.1", "F.2.2.1", "F.3.2.1", "F.4.2.1") &
      choice == "Neighborhood B" ~ 1)))
# new # of rows is 1999*8, or 15992 obs.

# Split the "Neighborhood" id (e.g. "F.1.1.1") into string variables:
## Field (just "F", removed below),
## task_no (1-4) and
## neighborhood (1 or 2, recoded to A or B below).
# The id has four pieces but only three are wanted, so the trailing piece is
# dropped explicitly (extra = "drop") instead of relying on the default, which
# discards it with a warning.
data <- data %>%
  separate(Neighborhood, c("Field", "task_no", "neighborhood"),
           extra = "drop") %>%
  # Similarly split the "Attribute" text into its parts; the pieces named
  # comp2, and, comp3 are not needed and are removed below.
  # NOTE(review): presumably the text reads like
  # "<pct> <group>, <pct> <group>, and <pct> <group>" -- confirm against data.
  separate(Attribute, c("composition", "largest_group", "comp2", "second_group",
                        "and", "comp3", "third_group")) %>%
  # remove unnecessary variables
  select(-c(Field, comp2, and, comp3)) %>%
  mutate(
    # make task_no numeric
    task_no = as.numeric(task_no),
    # recode neighborhood to a factor labelled A or B
    neighborhood = recode(as.factor(neighborhood), "1" = "A", "2" = "B")
  )

data <- data %>%
  mutate(
    # relabel composition (stored as a factor) with the full three-way split
    composition = recode(as.factor(composition),
                         "50" = "50/48/2",
                         "60" = "60/38/2",
                         "70" = "70/28/2",
                         "80" = "80/18/2",
                         "90" = "90/8/2"),
    # the omitted group is the first of White/Black/Latino/Asian that appears
    # in none of the three group slots
    omitted_group = case_when(
      !(largest_group == "White" | second_group == "White" |
          third_group == "White") ~ "White",
      !(largest_group == "Black" | second_group == "Black" |
          third_group == "Black") ~ "Black",
      !(largest_group == "Latino" | second_group == "Latino" |
          third_group == "Latino") ~ "Latino",
      !(largest_group == "Asian" | second_group == "Asian" |
          third_group == "Asian") ~ "Asian"),
    # store both as factors with "White" as the reference level
    largest_group = relevel(as.factor(largest_group), ref = "White"),
    omitted_group = relevel(as.factor(omitted_group), ref = "White")
  )



# Numeric version of the family income variable (inc_num, in thousands).
# Value assigned to the open-ended top category:
# NOTE(review): V and M look like a Pareto-tail estimate for the top bracket,
# with 2192 and 1016 presumably the counts in the two highest brackets --
# confirm the source of these constants.
V <- (log(2192 + 1016) - log(1016)) / (log(150000) - log(100000))
M <- 150000 * 0.5 * (1 + (V / (V - 1)))

# Map each income bracket to a single dollar value, then rescale to thousands.
data <- mutate(
  data,
  inc_num = as.numeric(case_when(
    faminc == "Less than $10,000" ~ 5000,
    faminc == "$10,000 - $19,999" ~ 14999.5,
    faminc == "$20,000 - $29,999" ~ 24999.5,
    faminc == "$30,000 - $39,999" ~ 34999.5,
    faminc == "$40,000 - $49,999" ~ 44999.5,
    faminc == "$50,000 - $59,999" ~ 54999.5,
    faminc == "$60,000 - $69,999" ~ 64999.5,
    faminc == "$70,000 - $79,999" ~ 74999.5,
    faminc == "$80,000 - $89,999" ~ 84999.5,
    faminc == "$90,000 - $99,999" ~ 94999.5,
    faminc == "$100,000 - $149,999" ~ 124999.5,
    faminc == "More than $150,000" ~ M
  )) / 1000
)

# Numeric and extended-factor versions of the AA attitudinal variables.
# aa_1, aa_2 and aa_3 are combined into one 5-point scale, stored both as a
# continuous score (0-4) and as a factor ordered from oppose to favor.
data <- data %>%
  mutate(
    aa_continuous = as.numeric(case_when(
      aa_3 == "Strongly oppose" ~ 0,
      aa_3 == "Not strongly oppose" ~ 1,
      aa_1 == "Neither favor nor oppose" ~ 2,
      aa_2 == "Not strongly favor" ~ 3,
      aa_2 == "Strongly favor" ~ 4
    )),
    aa_factor = fct_relevel(
      as.factor(case_when(
        aa_3 == "Strongly oppose" ~ "Strongly oppose",
        aa_3 == "Not strongly oppose" ~ "Not strongly oppose",
        aa_1 == "Neither favor nor oppose" ~ "Neither favor nor oppose",
        aa_2 == "Not strongly favor" ~ "Not strongly favor",
        aa_2 == "Strongly favor" ~ "Strongly favor"
      )),
      c("Strongly oppose",
        "Not strongly oppose",
        "Neither favor nor oppose",
        "Not strongly favor",
        "Strongly favor")
    )
  )

# Immigration attitude: a continuous score (0-2) and a releveled factor copy.
data <- data %>%
  mutate(
    # continuous version; any other answer (or NA) stays NA
    immigration_continuous = as.numeric(case_when(
      immigration == "Decreased" ~ 0,
      immigration == "Present level" ~ 1,
      immigration == "Increased" ~ 2
    )),
    # factor version with levels ordered Decreased < Present level < Increased
    immigration_factor = fct_relevel(
      immigration,
      c("Decreased",
        "Present level",
        "Increased")
    )
  )

####################################################################################################

# Two analysis datasets are produced from `data`:

# sample2: everything, "straightliners" included
# (the raw dataset that Qualtrics delivered to us)
sample2 <- data

# sample1: "straightliners" removed, i.e. only rows with replace_but_keep == 0
sample1 <- filter(data, replace_but_keep == 0)

